knitr::opts_chunk$set(
message = FALSE,
warning = FALSE
)
# Wymagane pakiety ----
library(tm) # Przetwarzanie tekstu
## Loading required package: NLP
library(SnowballC) # Stemming
library(cluster) # Klastrowanie
library(wordcloud) # Chmury słów
## Loading required package: RColorBrewer
library(factoextra) # Wizualizacje klastrów
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(RColorBrewer) # Kolory
library(ggplot2) # Wykresy
library(dplyr) # Przetwarzanie danych
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggrepel) # Dodawania etykiet w wykresach
library(DT) # Interaktywne tabele
# Dane tekstowe ----
# Ustaw Working Directory!
# Załaduj dokumenty z folderu
docs <- DirSource("/Users/adammnich/Desktop/zaj_7")
# W razie potrzeby dostosuj ścieżkę
# np.: docs <- DirSource("C:/User/Documents/textfolder")
# Utwórz korpus dokumentów tekstowych
corpus <- VCorpus(docs)
### Gdy tekst znajduje się w jednym pliku csv:
### data <- read.csv("file.csv", stringsAsFactors = FALSE, encoding = "UTF-8")
### corpus <- VCorpus(VectorSource(data$text))
# Korpus
inspect(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 11
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 7838
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 5642
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3916
##
## [[4]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2735
##
## [[5]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 6666
##
## [[6]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3668
##
## [[7]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 3525
##
## [[8]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 4294
##
## [[9]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 14991
##
## [[10]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 14991
##
## [[11]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 14946
# Korpus - zawartość przykładowego elementu
corpus[[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 7838
## [1] "Yuichi is searching for a man who pushed his son, in the first-class compartment of the bullet train, but he finds Prince instead and is tasered by her."
## [2] ""
## [3] "Ladybug steals the briefcase from Lemon and Tangerine but is attacked by another assassin (this time from Mexico), the \"Wolf\" (Benito A. Martínez Ocasio) (a Mexican assassin and former kingpin of a drug cartel), who blames Ladybug for fatally poisoning his entire wedding party, including his newlywed wife (whom Ladybug had actually saved). Ladybug was about to get off the train at the next station, but was met by Wolf at the door, who attacks Ladybug. Ladybug doesn't recognize the Wolf and denies having poisoned his entire wedding party. But Ladybug was present at the party as a waiter."
## $meta
## author : character(0)
## datetimestamp: 2025-04-03 15:54:01.72640109062195
## description : character(0)
## heading : character(0)
## id : Action__Bullet_Train2022.txt
## language : en
## origin : character(0)
# 1. Przetwarzanie i oczyszczanie tekstu ----
# (Text Preprocessing and Text Cleaning)
# Normalizacja i usunięcie zbędnych znaków ----
# Zapewnienie kodowania w całym korpusie
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte")))
# Funkcja do zamiany znaków na spację
toSpace <- content_transformer(function (x, pattern) gsub(pattern, " ", x))
# Usuń zbędne znaki lub pozostałości url, html itp.
# symbol @
corpus <- tm_map(corpus, toSpace, "@")
# symbol @ ze słowem (zazw. nazwa użytkownika)
corpus <- tm_map(corpus, toSpace, "@\\w+")
# linia pionowa
corpus <- tm_map(corpus, toSpace, "\\|")
# tabulatory
corpus <- tm_map(corpus, toSpace, "[ \t]{2,}")
# CAŁY adres URL:
corpus <- tm_map(corpus, toSpace, "(s?)(f|ht)tp(s?)://\\S+\\b")
# http i https
corpus <- tm_map(corpus, toSpace, "http\\w*")
# tylko ukośnik odwrotny (np. po http)
corpus <- tm_map(corpus, toSpace, "/")
# pozostałość po re-tweecie
corpus <- tm_map(corpus, toSpace, "(RT|via)((?:\\b\\W*@\\w+)+)")
# inne pozostałości
corpus <- tm_map(corpus, toSpace, "www")
corpus <- tm_map(corpus, toSpace, "~")
corpus <- tm_map(corpus, toSpace, "–")
# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "Yuichi is searching for a man who pushed his son, in the first-class compartment of the bullet train, but he finds Prince instead and is tasered by her."
## [2] ""
## [3] "Ladybug steals the briefcase from Lemon and Tangerine but is attacked by another assassin (this time from Mexico), the \"Wolf\" (Benito A. Martínez Ocasio) (a Mexican assassin and former kingpin of a drug cartel), who blames Ladybug for fatally poisoning his entire wedding party, including his newlywed wife (whom Ladybug had actually saved). Ladybug was about to get off the train at the next station, but was met by Wolf at the door, who attacks Ladybug. Ladybug doesn't recognize the Wolf and denies having poisoned his entire wedding party. But Ladybug was present at the party as a waiter."
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "
## [2] ""
## [3] "ladybug steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames ladybug fatally poisoning entire wedding party including newlywed wife ladybug actually saved ladybug get train next station met wolf door attacks ladybug ladybug recognize wolf denies poisoned entire wedding party ladybug present party waiter"
# usunięcie ewt. zbędnych nazw własnych
corpus <- tm_map(corpus, removeWords, c("rose", "roses", "kate", "kates", "iris", "tyler", "tylers", "javi", "javis", "reed", "josh", "joshs", "elliot", "elliots", "julian", "julians", "patrick", "patricks", "margot", "margots", "one", "however", "ladybug"))
corpus <- tm_map(corpus, stripWhitespace)
# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "
## [2] ""
## [3] " steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames fatally poisoning entire wedding party including newlywed wife actually saved get train next station met wolf door attacks recognize wolf denies poisoned entire wedding party present party waiter"
# Stemming ----
# zachowaj kopię korpusu
# do użycia jako dictionary w uzupełnianiu rdzeni
corpus_copy <- corpus
# wykonaj stemming w korpusie
corpus_stemmed <- tm_map(corpus, stemDocument)
# Sprawdzenie
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "
## [2] ""
## [3] " steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames fatally poisoning entire wedding party including newlywed wife actually saved get train next station met wolf door attacks recognize wolf denies poisoned entire wedding party present party waiter"
# Sprawdzenie
corpus_stemmed[[1]][[1]][7:9]
## [1] "yuichi search man push son firstclass compart bullet train find princ instead taser"
## [2] ""
## [3] "steal briefcas lemon tangerin attack anoth assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blame fatal poison entir wed parti includ newlyw wife actual save get train next station met wolf door attack recogn wolf deni poison entir wed parti present parti waiter"
# Uzupełnienie rdzeni słów po stemmingu ----
# funkcja pomocnicza: wykonuje stemCompletion linia po linii
complete_stems <- content_transformer(function(x, dict) {
x <- unlist(strsplit(x, " ")) # podziel na słowa
x <- stemCompletion(x, dictionary = corpus_copy, type="longest") # uzupełnij rdzenie
paste(x, collapse = " ") # połącz z powrotem w tekst
})
# wykonaj stemCompletion do każdego dokumentu w korpusie
corpus_completed <- tm_map(corpus_stemmed, complete_stems, dict = corpus_copy)
# usuń NA
corpus_completed <- tm_map(corpus_completed, toSpace, "NA")
corpus_completed <- tm_map(corpus_completed, stripWhitespace)
# Sprawdzenie
corpus_completed[[1]][[1]][1]
## [1] "tokyo yuichis kimura andrew koji seeking revenge sons wataru pushed rooftop told boards bulletproof train later night elders hiroyuki sanada meanwhile guided handler maria sandra bullock former assassinate brad pitt seasoned american assassinate suffering considers assigned retrieve briefcase bulletproof train bound kyoto previously contract carver callrbind sick initially recently stringsasfactors bad lucky jobs resulted accidentally deaths also train younger womans codenamed princes joey kingpin younger womans disguised schoolgirl manipulating assassinate considers attacking wataru two englishman assassinate brothers callrbind lemon brian tyree tangerineslater aaron taylorjohnson assigned escort briefcase ransom money malfunctioning sons sons russianborn yakuza boss known white deaths michael shannon japanese yakuza member minegishis former advisor hired dueling roles jobs bolivia sons logan lerman white deaths sons kidnapped prior eventually film minegishis japanese mob boss white deaths russianborn kgb working ranken eventually white deaths turns minegishis killing entire clan becomes mob boss white deaths wife died driven accidentally yuichis searching manipulating pushed sons firstclass compartment bulletproof train finding princes instead tasered steals briefcase lemon tangerineslater attacking another assassinate time mexico wolfs benito martínez ocasio mexican assassinate former kingpin drug cartel blames fatally poisoning entire wedding including newlywed wife actually saved getting train next station meteorology wolfs doors attacking recognizing wolfs denies poisoning entire wedding present waiter briefcase fighting wolfs knife throw rebounds briefcase heart leading deaths distraught stashes briefcase away arrangements wolfs corpse looking like sleeping passenger meanwhile princes revealing yuichis pushed wataru rooftop lure train partners elaborate planned assassinate white deaths well factorclusters henchman holding wataru hostage hospital orderlies killing anything happening princes knowing yuichis working white deaths wanting assassinate mob boss lemon tangerineslater searching missionaries briefcase white deaths sons poisoning died manner wolfs wedding months prior princes running tangerineslater saying saw briefcase tangerineslater looking tangerineslater wanting attacking meanwhile offers briefcase lemon returning getting train lemon suspects killing white deaths sons falsely admitted dueling lemon specifications knew killing someone believing meant wolfs leading fighting lemon knocking unconscious realization innocent upon awakening princes yuichis finding briefcase boobytrapped explosives killing white deaths well yuichis rigged gun second precaution encounters tangerineslater kicks train another scuffle manages climbs backyard aboard tangerineslater realization briefcase killing white deaths sons willing letter goodbye needed falls guy white deaths sons killing suspicious lemon shooting injuries yuichis finding yuichis princes together figure running shows train getting taken princes innocent schoolgirl activates believing yuichis kidnapped collapses drinking water spiked sleeping drug brought princes shooting lemon stashes yuichis bathroom encounters yeti another assassinate hornet zazie beetz american assassinate specializes poisoning disguised mascot poisoning white deaths sons wolfs wedding modified boomslang venom struggles exposing venom steals antivenom saved leaving died tangerineslater running princes realization shot lemon lemon puts sticker princes bad personally attacking candle shooting accidentally killing tangerineslater gun believing princes innocent agrees protect despite tangerineslater died pleasant yuichis father elders boards train seemingly princes lie recognizing sound voice informs wataru safe undercover bodyguard killing princes operative fleeing elders telling seeking revenge white deaths killing wife taken yakuza clan elders senior position minegishis clan decimated white deaths fate brought together ended discovering yuichis lemon bulletproof vest still alive albeit injuries fourth working together face white deaths kyoto gives briefcase white deaths princes revealing white deaths estranged daughter tries goad shooting yuichis rigged gun fails white deaths explaining assassinate train well sons responsible way deaths wife exception wolfs princes latters hired replace carver killing white deaths wife hired hopes killing white deaths explaining lemon tangerineslater killing crews bolivia goodbye deal sons got arrested white deaths wife went bail meteorology accidentally skilled heart surgeon saved poisoning hornet wife died operative table carver sent assassinate white deaths ended killing wife instead white deaths henchmen opener boobytrapped briefcase explodes knocking white deaths backyard onto train white deaths remaining henchmen boards battle assassinate elders dueling white deaths sword fighting fighting causing train hurtle controlled crashed downtown kyoto emergency wreck elders katana stuck chest white deaths tries killing blown rigged gun princes threatening yuichis elders machine gun proclaiming newspaper white deaths suddenly struck killing passenger fruit truck hauling tangerineslater revealing driven lemon avenging tangerineslater deaths process maria arriving retrieve celebrates finally getting bulletproof train japanese authorities arriving tries cleaning incredible damaged downtown kyoto assassinate exploitation causing"
# Porównaj:
corpus[[1]][[1]][7:9]
## [1] "yuichi searching man pushed son firstclass compartment bullet train finds prince instead tasered "
## [2] ""
## [3] " steals briefcase lemon tangerine attacked another assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blames fatally poisoning entire wedding party including newlywed wife actually saved get train next station met wolf door attacks recognize wolf denies poisoned entire wedding party present party waiter"
corpus_stemmed[[1]][[1]][7:9]
## [1] "yuichi search man push son firstclass compart bullet train find princ instead taser"
## [2] ""
## [3] "steal briefcas lemon tangerin attack anoth assassin time mexico wolf benito martínez ocasio mexican assassin former kingpin drug cartel blame fatal poison entir wed parti includ newlyw wife actual save get train next station met wolf door attack recogn wolf deni poison entir wed parti present parti waiter"
# Decyzja dotycząca korpusu ----
# Należy w tym momencie rozważyć,
# który obiekt użyć do dalszej analizy:
#
# - corpus (oryginalny, bez stemmingu)
# - corpus_stemmed (po stemmingu)
# - corpus_completed (uzupełnione rdzenie)
# Tokenizacja ----
# Macierze częstości TDM i DTM ----
# a) Funkcja TermDocumentMatrix() ----
# tokeny = wiersze, dokumenty = kolumny
tdm <- TermDocumentMatrix(corpus_completed)
tdm
## <<TermDocumentMatrix (terms: 1492, documents: 11)>>
## Non-/sparse entries: 3020/13392
## Sparsity : 82%
## Maximal term length: 38
## Weighting : term frequency (tf)
## <<TermDocumentMatrix (terms: 1492, documents: 11)>>
## Non-/sparse entries: 3020/13392
## Sparsity : 82%
## Maximal term length: 38
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms Action__Bullet_Train2022.txt Action__Twisters2024.txt
## corpuscompleted 0 0
## dokumentów 0 0
## dtmmclusterdocsidx 0 0
## falsely 1 0
## klaster 0 0
## klastrów 0 0
## klastrowaniecluster 0 0
## słów 0 0
## tmmapcorpuscompleted 0 0
## wordcloudnameswordfreq 0 0
## Docs
## Terms Dark_Comedy__Menu2022.txt Supernatural__Smile2022.txt
## corpuscompleted 0 0
## dokumentów 0 0
## dtmmclusterdocsidx 0 0
## falsely 0 0
## klaster 0 0
## klastrów 0 0
## klastrowaniecluster 0 0
## słów 0 0
## tmmapcorpuscompleted 0 0
## wordcloudnameswordfreq 0 0
## Docs
## Terms Thriller_Psych__Heretic2024.txt
## corpuscompleted 0
## dokumentów 0
## dtmmclusterdocsidx 0
## falsely 0
## klaster 0
## klastrów 0
## klastrowaniecluster 0
## słów 0
## tmmapcorpuscompleted 0
## wordcloudnameswordfreq 0
## Docs
## Terms Thriller_SciFi__Caddo_Lake2024.txt
## corpuscompleted 0
## dokumentów 0
## dtmmclusterdocsidx 0
## falsely 0
## klaster 0
## klastrów 0
## klastrowaniecluster 0
## słów 0
## tmmapcorpuscompleted 0
## wordcloudnameswordfreq 0
## Docs
## Terms Thriller_SciFi__Companion2025.txt
## corpuscompleted 0
## dokumentów 0
## dtmmclusterdocsidx 0
## falsely 0
## klaster 0
## klastrów 0
## klastrowaniecluster 0
## słów 0
## tmmapcorpuscompleted 0
## wordcloudnameswordfreq 0
## Docs
## Terms Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
## corpuscompleted 36
## dokumentów 41
## dtmmclusterdocsidx 19
## falsely 19
## klaster 24
## klastrów 33
## klastrowaniecluster 34
## słów 22
## tmmapcorpuscompleted 22
## wordcloudnameswordfreq 34
## Docs
## Terms Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
## corpuscompleted 36
## dokumentów 41
## dtmmclusterdocsidx 19
## falsely 19
## klaster 24
## klastrów 33
## klastrowaniecluster 34
## słów 22
## tmmapcorpuscompleted 22
## wordcloudnameswordfreq 34
## Docs
## Terms Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
## corpuscompleted 36
## dokumentów 41
## dtmmclusterdocsidx 19
## falsely 19
## klaster 24
## klastrów 33
## klastrowaniecluster 34
## słów 22
## tmmapcorpuscompleted 22
## wordcloudnameswordfreq 34
tdm_m <- as.matrix(tdm)
tdm_m[1:5, 1:5]
## Docs
## Terms Action__Bullet_Train2022.txt Action__Twisters2024.txt
## aaron 1 0
## abandoning 0 2
## abdominal 0 0
## ablaze 0 1
## aboard 1 0
## Docs
## Terms Dark_Comedy__Menu2022.txt Supernatural__Night_Swim2024.txt
## aaron 0 0
## abandoning 0 0
## abdominal 0 0
## ablaze 1 0
## aboard 0 0
## Docs
## Terms Supernatural__Smile2022.txt
## aaron 0
## abandoning 2
## abdominal 0
## ablaze 0
## aboard 0
# Można zapisać TDM w pliku .csv
# write.csv(tdm_m, file="TDM.csv")
# b) Funkcja DocumentTermMatrix() ----
# dokumenty = wiersze, tokeny = kolumny
dtm <- DocumentTermMatrix(corpus_completed)
dtm
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity : 82%
## Maximal term length: 38
## Weighting : term frequency (tf)
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity : 82%
## Maximal term length: 38
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs corpuscompleted
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 36
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 36
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 36
## Terms
## Docs dokumentów
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 41
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 41
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 41
## Terms
## Docs dtmmclusterdocsidx
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 19
## Terms
## Docs falsely
## Action__Bullet_Train2022.txt 1
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 19
## Terms
## Docs klaster
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 24
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 24
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 24
## Terms
## Docs klastrów
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 33
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 33
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 33
## Terms
## Docs klastrowaniecluster
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 34
## Terms
## Docs słów
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 22
## Terms
## Docs tmmapcorpuscompleted
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 22
## Terms
## Docs wordcloudnameswordfreq
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 34
dtm_m <- as.matrix(dtm)
dtm_m[1:5, 1:5]
## Terms
## Docs aaron abandoning abdominal ablaze aboard
## Action__Bullet_Train2022.txt 1 0 0 0 1
## Action__Twisters2024.txt 0 2 0 1 0
## Dark_Comedy__Menu2022.txt 0 0 0 1 0
## Supernatural__Night_Swim2024.txt 0 0 0 0 0
## Supernatural__Smile2022.txt 0 2 0 0 0
# Można zapisać DTM w pliku .csv
# write.csv(dtm_m, file="DTM.csv")
# 2. Zliczanie częstości słów ----
# (Word Frequency Count)
# Można zliczyć same częstości słów w macierzach
# dla TDM i DTM da to identyczny rezultat
v <- sort(rowSums(tdm_m), decreasing = TRUE)
tdm_df <- data.frame(word = names(v), freq = v)
head(tdm_df, 10)
v2 <- sort(colSums(dtm_m), decreasing = TRUE)
dtm_df <- data.frame(word = names(v2), freq = v2)
head(dtm_df, 10)
# 3. Eksploracyjna analiza danych ----
# (Exploratory Data Analysis, EDA)
# Chmura słów (globalna)
wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7,
colors = brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## corpuscompleted could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## zapewnienie could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## thememinimalbasesize could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## topwords could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## pastenameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dokumentów could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## interaktywna could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## sprawdzenie could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## klastrów could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## documentnames could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## textfolder could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dokumentom could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## true could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## removepunctuation could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## wordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## house could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## przypisania could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## tmmapcorpuscompleted could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## girlfriend could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## listpagelength could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## carefully could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## neardeath could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## drops could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## plików could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## lengthclusterdocsidx could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## leftjoindocumentsclusters could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## clusterinfodf could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## deaths could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## colsumsclusterdocs could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dtmmclusterdocsidx could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## finding could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## stopwordsenglish could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## attacking could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## klastrowaniecluster could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## dla could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## fvizclusterlistdata could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## killing could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## klastra could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## stemmingu could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## corpusstemmed could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## previously could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## termdocumentmatrixcorpuscompleted could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## documentsclusterszinfo could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## settings could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## princes could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## przetwarzanie could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## ustaw could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## kmeansdtmm could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## tokenizacja could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## pełnym could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## docnames could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## printdocumentsclusters could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## tangerineslater could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tdm_df$word, freq = tdm_df$freq, min.freq = 7, :
## briefcase could not be fit on page. It will not be plotted.
# Wyświetl top 10
print(head(tdm_df, 10))
## word freq
## dokumentów dokumentów 123
## corpuscompleted corpuscompleted 108
## klastrowaniecluster klastrowaniecluster 102
## wordcloudnameswordfreq wordcloudnameswordfreq 102
## klastrów klastrów 99
## klaster klaster 72
## słów słów 66
## tmmapcorpuscompleted tmmapcorpuscompleted 66
## falsely falsely 58
## dtmmclusterdocsidx dtmmclusterdocsidx 57
# 4. Inżynieria cech w modelu Bag of Words: ----
# Reprezentacja słów i dokumentów w przestrzeni wektorowej ----
# (Feature Engineering in vector-space BoW model)
# - podejście surowych częstości słów
# (częstość słowa = liczba wystąpień w dokumencie)
# (Raw Word Counts)
# Użyj utworzonej wcześniej macierzy DTM
dtm
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity : 82%
## Maximal term length: 38
## Weighting : term frequency (tf)
## <<DocumentTermMatrix (documents: 11, terms: 1492)>>
## Non-/sparse entries: 3020/13392
## Sparsity : 82%
## Maximal term length: 38
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs corpuscompleted
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 36
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 36
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 36
## Terms
## Docs dokumentów
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 41
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 41
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 41
## Terms
## Docs dtmmclusterdocsidx
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 19
## Terms
## Docs falsely
## Action__Bullet_Train2022.txt 1
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 19
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 19
## Terms
## Docs klaster
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 24
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 24
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 24
## Terms
## Docs klastrów
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 33
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 33
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 33
## Terms
## Docs klastrowaniecluster
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 34
## Terms
## Docs słów
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 22
## Terms
## Docs tmmapcorpuscompleted
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 22
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 22
## Terms
## Docs wordcloudnameswordfreq
## Action__Bullet_Train2022.txt 0
## Action__Twisters2024.txt 0
## Dark_Comedy__Menu2022.txt 0
## Supernatural__Smile2022.txt 0
## Thriller_Psych__Heretic2024.txt 0
## Thriller_SciFi__Caddo_Lake2024.txt 0
## Thriller_SciFi__Companion2025.txt 0
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 34
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 34
## Terms
## Docs aaron abandoning abdominal ablaze aboard
## Action__Bullet_Train2022.txt 1 0 0 0 1
## Action__Twisters2024.txt 0 2 0 1 0
## Dark_Comedy__Menu2022.txt 0 0 0 1 0
## Supernatural__Night_Swim2024.txt 0 0 0 0 0
## Supernatural__Smile2022.txt 0 2 0 0 0
# UCZENIE MASZYNOWE NIENADZOROWANE ----
# (Unsupervised Machine Learning)
# Klastrowanie k-średnich (k-means) ----
# Dobór liczby klastrów
# Metoda sylwetki (silhouette)
fviz_nbclust(t(dtm_m), kmeans, method = "silhouette") +
labs(title = "Dobór liczby klastrów", subtitle = "Metoda sylwetki")

# Wykonaj klastrowanie kmeans
# (sprawdź wyniki dla k = 3,4,5)
set.seed(123) # ziarno losowe dla replikacji wyników
# a) Ustaw liczbę klastrów k = 2 ----
k <- 2 # ustaw liczbę klastrów
klastrowanie <- kmeans(dtm_m, centers = k)
# Wizualizacja klastrów
fviz_cluster(list(data = dtm_m, cluster = klastrowanie$cluster),
geom = "point",
main = "Wizualizacja klastrów dokumentów")

# Interaktywna tabela z przypisaniem dokumentów i top 5 słów
# Dla każdego klastra: liczba dokumentów oraz top 5 słów
cluster_info <- lapply(1:k, function(i) {
cluster_docs_idx <- which(klastrowanie$cluster == i)
cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
word_freq <- sort(colSums(cluster_docs), decreasing = TRUE)
top_words <- paste(names(word_freq)[1:5], collapse = ", ")
data.frame(
Klaster = i,
Liczba_dokumentów = length(cluster_docs_idx),
Top_5_słów = top_words,
stringsAsFactors = FALSE
)
})
# Połącz wszystko w ramkę danych
cluster_info_df <- do.call(rbind, cluster_info)
# Nazwy dokumentów z korpusu
document_names <- names(corpus)
# Tabela przypisania dokumentów do klastrów
documents_clusters <- data.frame(
Dokument = document_names,
Klaster = klastrowanie$cluster,
stringsAsFactors = FALSE
)
# Dołączamy dane z podsumowania (JOIN po klastrze)
documents_clusters_z_info <- left_join(documents_clusters, cluster_info_df, by = "Klaster")
# Interaktywna tabela z pełnym podsumowaniem
datatable(documents_clusters_z_info,
caption = "Dokumenty, klastry, najczęstsze słowa i liczność klastrów",
rownames = FALSE,
options = list(pageLength = 10))
# Chmury słów dla każdego klastra
for (i in 1:k) {
# znajdź indeksy dokumentów w danym klastrze
cluster_docs_idx <- which(klastrowanie$cluster == i)
# nazwy plików odpowiadające dokumentom w tym klastrze
doc_names <- names(klastrowanie$cluster)[cluster_docs_idx]
# generuj chmurę słów dla klastra
cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
word_freq <- colSums(cluster_docs)
wordcloud(names(word_freq), freq = word_freq,
max.words = 15, colors = brewer.pal(8, "Dark2"))
title(paste("Chmura słów - Klaster", i))
}

## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## klastrowaniecluster could not be fit on page. It will not be plotted.

# a) Przypisanie dokumentów do klastrów ----
document_names <- names(corpus) # Nazwy dokumentów z korpusu
clusters <- klastrowanie$cluster # Przypisanie dokumentów do klastrów
# Ramka danych: dokumenty i ich klastry
documents_clusters <- data.frame(Dokument = document_names,
Klaster = as.factor(clusters))
# Podgląd
print(documents_clusters)
## Dokument
## Action__Bullet_Train2022.txt Action__Bullet_Train2022.txt
## Action__Twisters2024.txt Action__Twisters2024.txt
## Dark_Comedy__Menu2022.txt Dark_Comedy__Menu2022.txt
## Supernatural__Night_Swim2024.txt Supernatural__Night_Swim2024.txt
## Supernatural__Smile2022.txt Supernatural__Smile2022.txt
## Thriller_Psych__Heretic2024.txt Thriller_Psych__Heretic2024.txt
## Thriller_SciFi__Caddo_Lake2024.txt Thriller_SciFi__Caddo_Lake2024.txt
## Thriller_SciFi__Companion2025.txt Thriller_SciFi__Companion2025.txt
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
## Klaster
## Action__Bullet_Train2022.txt 1
## Action__Twisters2024.txt 1
## Dark_Comedy__Menu2022.txt 1
## Supernatural__Night_Swim2024.txt 1
## Supernatural__Smile2022.txt 1
## Thriller_Psych__Heretic2024.txt 1
## Thriller_SciFi__Caddo_Lake2024.txt 1
## Thriller_SciFi__Companion2025.txt 1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 2
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 2
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 2
# a) Wizualizacja przypisania dokumentów do klastrów ----
ggplot(documents_clusters, aes(x = reorder(Dokument, Klaster), fill = Klaster)) +
geom_bar(stat = "count", width = 0.7) +
coord_flip() +
labs(title = "Przypisanie dokumentów do klastrów",
x = "Dokument",
y = "Liczba wystąpień (powinna wynosić 1)",
fill = "Klaster") +
theme_minimal(base_size = 13)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

# b) Ustaw liczbę klastrów k = 3 ----
k <- 3 # ustaw liczbę klastrów
klastrowanie <- kmeans(dtm_m, centers = k)
# Wizualizacja klastrów
fviz_cluster(list(data = dtm_m, cluster = klastrowanie$cluster),
geom = "point",
main = "Wizualizacja klastrów dokumentów")

# Interaktywna tabela z przypisaniem dokumentów i top 5 słów
# Dla każdego klastra: liczba dokumentów oraz top 5 słów
cluster_info <- lapply(1:k, function(i) {
cluster_docs_idx <- which(klastrowanie$cluster == i)
cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
word_freq <- sort(colSums(cluster_docs), decreasing = TRUE)
top_words <- paste(names(word_freq)[1:5], collapse = ", ")
data.frame(
Klaster = i,
Liczba_dokumentów = length(cluster_docs_idx),
Top_5_słów = top_words,
stringsAsFactors = FALSE
)
})
# Połącz wszystko w ramkę danych
cluster_info_df <- do.call(rbind, cluster_info)
# Nazwy dokumentów z korpusu
document_names <- names(corpus)
# Tabela przypisania dokumentów do klastrów
documents_clusters <- data.frame(
Dokument = document_names,
Klaster = klastrowanie$cluster,
stringsAsFactors = FALSE
)
# Dołączamy dane z podsumowania (JOIN po klastrze)
documents_clusters_z_info <- left_join(documents_clusters, cluster_info_df, by = "Klaster")
# Interaktywna tabela z pełnym podsumowaniem
datatable(documents_clusters_z_info,
caption = "Dokumenty, klastry, najczęstsze słowa i liczność klastrów",
rownames = FALSE,
options = list(pageLength = 10))
# Chmury słów dla każdego klastra
for (i in 1:k) {
# znajdź indeksy dokumentów w danym klastrze
cluster_docs_idx <- which(klastrowanie$cluster == i)
# nazwy plików odpowiadające dokumentom w tym klastrze
doc_names <- names(klastrowanie$cluster)[cluster_docs_idx]
# generuj chmurę słów dla klastra
cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
word_freq <- colSums(cluster_docs)
wordcloud(names(word_freq), freq = word_freq,
max.words = 15, colors = brewer.pal(8, "Dark2"))
title(paste("Chmura słów - Klaster", i))
}


## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## klastrowaniecluster could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## corpuscompleted could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## dokumentów could not be fit on page. It will not be plotted.

# b) Przypisanie dokumentów do klastrów ----
document_names <- names(corpus) # Nazwy dokumentów z korpusu
clusters <- klastrowanie$cluster # Przypisanie dokumentów do klastrów
# Ramka danych: dokumenty i ich klastry
documents_clusters <- data.frame(Dokument = document_names,
Klaster = as.factor(clusters))
# Podgląd
print(documents_clusters)
## Dokument
## Action__Bullet_Train2022.txt Action__Bullet_Train2022.txt
## Action__Twisters2024.txt Action__Twisters2024.txt
## Dark_Comedy__Menu2022.txt Dark_Comedy__Menu2022.txt
## Supernatural__Night_Swim2024.txt Supernatural__Night_Swim2024.txt
## Supernatural__Smile2022.txt Supernatural__Smile2022.txt
## Thriller_Psych__Heretic2024.txt Thriller_Psych__Heretic2024.txt
## Thriller_SciFi__Caddo_Lake2024.txt Thriller_SciFi__Caddo_Lake2024.txt
## Thriller_SciFi__Companion2025.txt Thriller_SciFi__Companion2025.txt
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
## Klaster
## Action__Bullet_Train2022.txt 2
## Action__Twisters2024.txt 1
## Dark_Comedy__Menu2022.txt 1
## Supernatural__Night_Swim2024.txt 1
## Supernatural__Smile2022.txt 1
## Thriller_Psych__Heretic2024.txt 1
## Thriller_SciFi__Caddo_Lake2024.txt 1
## Thriller_SciFi__Companion2025.txt 1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 3
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 3
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 3
# b) Wizualizacja przypisania dokumentów do klastrów ----
ggplot(documents_clusters, aes(x = reorder(Dokument, Klaster), fill = Klaster)) +
geom_bar(stat = "count", width = 0.7) +
coord_flip() +
labs(title = "Przypisanie dokumentów do klastrów",
x = "Dokument",
y = "Liczba wystąpień (powinna wynosić 1)",
fill = "Klaster") +
theme_minimal(base_size = 13)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

# c) Ustaw liczbę klastrów k = 4 ----
k <- 4 # ustaw liczbę klastrów
klastrowanie <- kmeans(dtm_m, centers = k)
# Wizualizacja klastrów
fviz_cluster(list(data = dtm_m, cluster = klastrowanie$cluster),
geom = "point",
main = "Wizualizacja klastrów dokumentów")

# Interaktywna tabela z przypisaniem dokumentów i top 5 słów
# Dla każdego klastra: liczba dokumentów oraz top 5 słów
cluster_info <- lapply(1:k, function(i) {
cluster_docs_idx <- which(klastrowanie$cluster == i)
cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
word_freq <- sort(colSums(cluster_docs), decreasing = TRUE)
top_words <- paste(names(word_freq)[1:5], collapse = ", ")
data.frame(
Klaster = i,
Liczba_dokumentów = length(cluster_docs_idx),
Top_5_słów = top_words,
stringsAsFactors = FALSE
)
})
# Połącz wszystko w ramkę danych
cluster_info_df <- do.call(rbind, cluster_info)
# Nazwy dokumentów z korpusu
document_names <- names(corpus)
# Tabela przypisania dokumentów do klastrów
documents_clusters <- data.frame(
Dokument = document_names,
Klaster = klastrowanie$cluster,
stringsAsFactors = FALSE
)
# Dołączamy dane z podsumowania (JOIN po klastrze)
documents_clusters_z_info <- left_join(documents_clusters, cluster_info_df, by = "Klaster")
# Interaktywna tabela z pełnym podsumowaniem
datatable(documents_clusters_z_info,
caption = "Dokumenty, klastry, najczęstsze słowa i liczność klastrów",
rownames = FALSE,
options = list(pageLength = 10))
# Chmury słów dla każdego klastra
for (i in 1:k) {
# znajdź indeksy dokumentów w danym klastrze
cluster_docs_idx <- which(klastrowanie$cluster == i)
# nazwy plików odpowiadające dokumentom w tym klastrze
doc_names <- names(klastrowanie$cluster)[cluster_docs_idx]
# generuj chmurę słów dla klastra
cluster_docs <- dtm_m[cluster_docs_idx, , drop = FALSE]
word_freq <- colSums(cluster_docs)
wordcloud(names(word_freq), freq = word_freq,
max.words = 15, colors = brewer.pal(8, "Dark2"))
title(paste("Chmura słów - Klaster", i))
}
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## wordcloudnameswordfreq could not be fit on page. It will not be plotted.
## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## klastrowaniecluster could not be fit on page. It will not be plotted.

## Warning in wordcloud(names(word_freq), freq = word_freq, max.words = 15, :
## returning could not be fit on page. It will not be plotted.



# c) Przypisanie dokumentów do klastrów ----
document_names <- names(corpus) # Nazwy dokumentów z korpusu
clusters <- klastrowanie$cluster # Przypisanie dokumentów do klastrów
# Ramka danych: dokumenty i ich klastry
documents_clusters <- data.frame(Dokument = document_names,
Klaster = as.factor(clusters))
# Podgląd
print(documents_clusters)
## Dokument
## Action__Bullet_Train2022.txt Action__Bullet_Train2022.txt
## Action__Twisters2024.txt Action__Twisters2024.txt
## Dark_Comedy__Menu2022.txt Dark_Comedy__Menu2022.txt
## Supernatural__Night_Swim2024.txt Supernatural__Night_Swim2024.txt
## Supernatural__Smile2022.txt Supernatural__Smile2022.txt
## Thriller_Psych__Heretic2024.txt Thriller_Psych__Heretic2024.txt
## Thriller_SciFi__Caddo_Lake2024.txt Thriller_SciFi__Caddo_Lake2024.txt
## Thriller_SciFi__Companion2025.txt Thriller_SciFi__Companion2025.txt
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd
## Klaster
## Action__Bullet_Train2022.txt 4
## Action__Twisters2024.txt 3
## Dark_Comedy__Menu2022.txt 2
## Supernatural__Night_Swim2024.txt 2
## Supernatural__Smile2022.txt 2
## Thriller_Psych__Heretic2024.txt 2
## Thriller_SciFi__Caddo_Lake2024.txt 2
## Thriller_SciFi__Companion2025.txt 2
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.R 1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.R 1
## Zaj_7.BoW_Przestrzen_wektorowa_Klastrowanie_kMeans_rozne_k.spin.Rmd 1
# c) Wizualizacja przypisania dokumentów do klastrów ----
ggplot(documents_clusters, aes(x = reorder(Dokument, Klaster), fill = Klaster)) +
geom_bar(stat = "count", width = 0.7) +
coord_flip() +
labs(title = "Przypisanie dokumentów do klastrów",
x = "Dokument",
y = "Liczba wystąpień (powinna wynosić 1)",
fill = "Klaster") +
theme_minimal(base_size = 13)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
